In this notebook I import gene-level counts from snow crab RNA-Seq data. Trimmed RNASeq reads were aligned to the tanner crab genome using the STAR aligner, and counts (i.e. the number of reads aligning to each gene) were determined during alignment by STAR using the tanner annotation file. NOTE: before the alignment I converted the .gff annotation file (final_annotation.gff) to a .gtf file (final_annotation_LHS.gtf) for STAR.

Load R packages needed for this notebook

list.of.packages <- c("tidyverse", "reshape2", "here", "plotly", "purrr", "janitor", "readxl", "clipr") #add new libraries here 

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

Read in gene count data

Generate tab-separated file that lists data file names (one file for each library) and sample number

NOTE: this code chunk uses the bash language, not R

rm ../results/star/countsfilenames.txt
for file in ../results/star/*.ReadsPerGene.out.tab
do
filename="$(echo $file)"
sample="$(basename -a $filename | cut -d "." -f 1)"
printf "%s\t%s\n" "$filename" "$sample" >> ../results/star/countsfilenames.txt
done

Preview the contents of the countsfilenames.txt file

head -n 5 ../results/star/countsfilenames.txt
## ../results/star/1.ReadsPerGene.out.tab   1
## ../results/star/10.ReadsPerGene.out.tab  10
## ../results/star/11.ReadsPerGene.out.tab  11
## ../results/star/12.ReadsPerGene.out.tab  12
## ../results/star/13.ReadsPerGene.out.tab  13

Generate object with filenames and sample annotation information, then join

# Create object from the countsfilenames.txt file 
filenames <- read_delim(file="../results/star/countsfilenames.txt", delim = "\t", col_names = c("filename", "sample")) %>%
  mutate(sampleID=paste("S", sample, sep=""))
## Rows: 63 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): filename
## dbl (1): sample
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
files <- file.path(filenames$filename) #extract vector of filenames 
all(file.exists(files)) #easy code to check that all files exist! 
## [1] TRUE
# Create sample.info object with annotation name for each sample/file, this will be used throughout the analysis  
sample.info <- read_excel("../data/Crabs for RNA analysis.xlsx", sheet = "Sheet1", 
                          skip = 8, col_names = c("sample", "pH", "duration"), na = "N/A") %>% 
          mutate_at(vars(pH, duration), as.factor) %>%
  mutate(OA=as.factor(case_when(
    pH=="pH 7.5" ~ "severe",
    pH=="pH 7.8" ~ "moderate",
    pH=="Ambient" ~ "ambient"))) %>%
  mutate(treatment=as.factor(case_when(
    (OA=="severe" & duration=="long") ~ "severe_long",
    (OA=="severe" & duration=="short") ~ "severe_short",
    (OA=="moderate" & duration=="long") ~ "moderate_long",
    (OA=="moderate" & duration=="short") ~ "moderate_short",
    TRUE ~ "ambient"))) %>%
  left_join(filenames)
## Joining, by = "sample"

Import count data into list of dataframes (one for each library)

selecting only the first 2 columns (1=gene ID, 2=count); rename columns

file_list <- vector(mode = "list", length = nrow(filenames))
names(file_list) <- c(filenames$sampleID)

# Check out number of counts & number of genes for each sammple 
for (i in 1:nrow(filenames)) {
    file_list[[i]] <- data.frame(read.delim(file=files[i], header = F))[-1:-4,1:2]
    names(file_list[[i]]) <- c("gene", filenames$sampleID[i])
    print(paste("Total COUNTS,", names(file_list[[i]][2]), ":", prettyNum(sum(file_list[[i]][2]), big.mark=","), sep=" "))
    print(paste("Total GENES,", names(file_list[[i]][2]), ":", prettyNum(nrow(file_list[[i]] %>% filter(.[[2]] != 0)), big.mark=","), sep=" "))
}
## [1] "Total COUNTS, S1 : 27,356,371"
## [1] "Total GENES, S1 : 15,813"
## [1] "Total COUNTS, S10 : 25,950,594"
## [1] "Total GENES, S10 : 16,458"
## [1] "Total COUNTS, S11 : 23,538,132"
## [1] "Total GENES, S11 : 16,122"
## [1] "Total COUNTS, S12 : 28,466,091"
## [1] "Total GENES, S12 : 15,936"
## [1] "Total COUNTS, S13 : 26,243,223"
## [1] "Total GENES, S13 : 15,608"
## [1] "Total COUNTS, S14 : 32,556,291"
## [1] "Total GENES, S14 : 16,257"
## [1] "Total COUNTS, S15 : 27,267,105"
## [1] "Total GENES, S15 : 16,201"
## [1] "Total COUNTS, S16 : 27,765,649"
## [1] "Total GENES, S16 : 16,008"
## [1] "Total COUNTS, S17 : 25,949,479"
## [1] "Total GENES, S17 : 16,866"
## [1] "Total COUNTS, S18 : 23,261,655"
## [1] "Total GENES, S18 : 15,837"
## [1] "Total COUNTS, S19 : 24,146,983"
## [1] "Total GENES, S19 : 14,964"
## [1] "Total COUNTS, S2 : 19,577,558"
## [1] "Total GENES, S2 : 16,169"
## [1] "Total COUNTS, S20 : 22,467,848"
## [1] "Total GENES, S20 : 15,491"
## [1] "Total COUNTS, S21 : 19,897,996"
## [1] "Total GENES, S21 : 14,782"
## [1] "Total COUNTS, S22 : 24,263,123"
## [1] "Total GENES, S22 : 15,418"
## [1] "Total COUNTS, S23 : 23,770,831"
## [1] "Total GENES, S23 : 15,369"
## [1] "Total COUNTS, S24 : 21,012,750"
## [1] "Total GENES, S24 : 15,959"
## [1] "Total COUNTS, S25 : 26,330,257"
## [1] "Total GENES, S25 : 15,783"
## [1] "Total COUNTS, S26 : 23,863,525"
## [1] "Total GENES, S26 : 16,665"
## [1] "Total COUNTS, S27 : 22,485,686"
## [1] "Total GENES, S27 : 15,065"
## [1] "Total COUNTS, S28 : 19,402,036"
## [1] "Total GENES, S28 : 15,178"
## [1] "Total COUNTS, S29 : 22,378,027"
## [1] "Total GENES, S29 : 15,167"
## [1] "Total COUNTS, S3 : 22,297,607"
## [1] "Total GENES, S3 : 15,569"
## [1] "Total COUNTS, S30 : 25,014,821"
## [1] "Total GENES, S30 : 14,932"
## [1] "Total COUNTS, S31 : 20,806,911"
## [1] "Total GENES, S31 : 15,612"
## [1] "Total COUNTS, S32 : 26,829,852"
## [1] "Total GENES, S32 : 16,496"
## [1] "Total COUNTS, S33 : 24,993,968"
## [1] "Total GENES, S33 : 15,019"
## [1] "Total COUNTS, S34 : 21,352,629"
## [1] "Total GENES, S34 : 14,913"
## [1] "Total COUNTS, S35 : 23,653,101"
## [1] "Total GENES, S35 : 15,108"
## [1] "Total COUNTS, S36 : 19,468,532"
## [1] "Total GENES, S36 : 14,173"
## [1] "Total COUNTS, S37 : 20,369,652"
## [1] "Total GENES, S37 : 15,415"
## [1] "Total COUNTS, S38 : 24,213,565"
## [1] "Total GENES, S38 : 15,817"
## [1] "Total COUNTS, S39 : 27,002,471"
## [1] "Total GENES, S39 : 15,327"
## [1] "Total COUNTS, S4 : 14,176,786"
## [1] "Total GENES, S4 : 14,240"
## [1] "Total COUNTS, S40 : 28,297,886"
## [1] "Total GENES, S40 : 15,910"
## [1] "Total COUNTS, S41 : 26,946,623"
## [1] "Total GENES, S41 : 16,173"
## [1] "Total COUNTS, S42 : 29,231,086"
## [1] "Total GENES, S42 : 15,712"
## [1] "Total COUNTS, S43 : 21,222,982"
## [1] "Total GENES, S43 : 15,818"
## [1] "Total COUNTS, S44 : 44,505,472"
## [1] "Total GENES, S44 : 17,626"
## [1] "Total COUNTS, S45 : 493,648"
## [1] "Total GENES, S45 : 7,947"
## [1] "Total COUNTS, S46 : 21,482,663"
## [1] "Total GENES, S46 : 16,511"
## [1] "Total COUNTS, S47 : 25,423,628"
## [1] "Total GENES, S47 : 16,491"
## [1] "Total COUNTS, S48 : 24,483,637"
## [1] "Total GENES, S48 : 15,963"
## [1] "Total COUNTS, S49 : 23,676,454"
## [1] "Total GENES, S49 : 15,556"
## [1] "Total COUNTS, S5 : 21,763,794"
## [1] "Total GENES, S5 : 15,299"
## [1] "Total COUNTS, S50 : 25,478,463"
## [1] "Total GENES, S50 : 15,830"
## [1] "Total COUNTS, S51 : 27,365,340"
## [1] "Total GENES, S51 : 16,569"
## [1] "Total COUNTS, S52 : 23,270,277"
## [1] "Total GENES, S52 : 16,432"
## [1] "Total COUNTS, S53 : 32,346,325"
## [1] "Total GENES, S53 : 16,699"
## [1] "Total COUNTS, S54 : 36,722,531"
## [1] "Total GENES, S54 : 16,760"
## [1] "Total COUNTS, S55 : 28,187,412"
## [1] "Total GENES, S55 : 16,235"
## [1] "Total COUNTS, S56 : 33,181,676"
## [1] "Total GENES, S56 : 16,715"
## [1] "Total COUNTS, S57 : 25,071,047"
## [1] "Total GENES, S57 : 15,657"
## [1] "Total COUNTS, S58 : 26,296,226"
## [1] "Total GENES, S58 : 16,251"
## [1] "Total COUNTS, S59 : 23,712,683"
## [1] "Total GENES, S59 : 15,277"
## [1] "Total COUNTS, S6 : 20,900,574"
## [1] "Total GENES, S6 : 14,864"
## [1] "Total COUNTS, S60 : 25,561,202"
## [1] "Total GENES, S60 : 15,847"
## [1] "Total COUNTS, S61 : 32,684,320"
## [1] "Total GENES, S61 : 16,560"
## [1] "Total COUNTS, S62 : 45,169,493"
## [1] "Total GENES, S62 : 16,804"
## [1] "Total COUNTS, S63 : 23,263,690"
## [1] "Total GENES, S63 : 15,894"
## [1] "Total COUNTS, S7 : 16,502,090"
## [1] "Total GENES, S7 : 13,939"
## [1] "Total COUNTS, S8 : 25,240,705"
## [1] "Total GENES, S8 : 16,127"
## [1] "Total COUNTS, S9 : 25,121,288"
## [1] "Total GENES, S9 : 16,150"

Preview counts for one sample to see what gene count data looks like

file_list[[1]] %>% head(n=20)
##          gene  S1
## 5  gene_21170   0
## 6  gene_21172   0
## 7  gene_20492   0
## 8  gene_20493   2
## 9  gene_22758   0
## 10 gene_22759   0
## 11 gene_22760 114
## 12 gene_22756   0
## 13 gene_22761   7
## 14 gene_22757   1
## 15 gene_22762   2
## 16 gene_22083   0
## 17 gene_22101   0
## 18 gene_22102   0
## 19 gene_22103   2
## 20 gene_22104   0
## 21 gene_22105   0
## 22 gene_22084   0
## 23 gene_22106   1
## 24 gene_22085  98

Merge data from all samples into one dataframe

counts <- file_list %>% purrr::reduce(full_join, by = "gene") %>% column_to_rownames(var="gene")
head(counts) #resulting dataframe has genes by row, and samples by column
##            S1 S10 S11 S12 S13 S14 S15 S16 S17 S18 S19 S2 S20 S21 S22 S23 S24
## gene_21170  0   0   0   0   0   0   0   0   0   0   0  0   0   0   0   0   0
## gene_21172  0   0   0   0   0   0   0   0   0   0   0  0   0   0   0   0   0
## gene_20492  0   0   0   0   0   0   0   0   0   0   0  0   0   0   0   0   0
## gene_20493  2   0   1   0   0   1   0   0   0   0   0  0   0   0   0   3   0
## gene_22758  0  15   2   3   1   4   5   0   7   2   0  7   3   0   0   3   3
## gene_22759  0   0   0   0   0   0   0   0   0   0   0  0   0   0   0   0   0
##            S25 S26 S27 S28 S29 S3 S30 S31 S32 S33 S34 S35 S36 S37 S38 S39 S4
## gene_21170   0   0   0   0   0  0   0   0   0   0   0   0   0   0   0   0  0
## gene_21172   0   0   0   0   0  0   0   0   0   0   0   0   0   0   0   0  0
## gene_20492   0   0   0   0   0  0   0   0   0   0   0   0   0   0   0   0  0
## gene_20493   5   1   0   0   0  0   1   2   0   3   2   0   0   0   0   0  0
## gene_22758   2   0   7   4   1  7   1   0   2   0   2   7   0   5   0   4 16
## gene_22759   0   0   0   0   0  0   0   0   0   0   0   0   0   0   0   0  0
##            S40 S41 S42 S43 S44 S45 S46 S47 S48 S49 S5 S50 S51 S52 S53 S54 S55
## gene_21170   0   0   0   0   0   0   0   0   0   0  0   0   0   0   0   0   0
## gene_21172   0   0   0   0   0   0   0   0   0   0  0   0   0   0   0   0   0
## gene_20492   0   0   0   0   1   0   0   0   0   0  0   0   0   0   0   0   0
## gene_20493   0   0   1   0   3   0   1   0   0   0  2   0   1   0   0   0   0
## gene_22758   0   3   2   5  11   0   3   0   0   4  1   6   0   5   1   3   3
## gene_22759   0   0   0   0   0   0   0   0   0   0  0   0   0   0   0   0   0
##            S56 S57 S58 S59 S6 S60 S61 S62 S63 S7 S8 S9
## gene_21170   0   0   0   0  0   0   0   0   0  0  0  0
## gene_21172   0   0   0   0  0   0   0   0   0  0  0  0
## gene_20492   0   0   0   0  0   0   0   0   0  0  0  0
## gene_20493   0   0   1   1  1   0   0   2   0  0  0  0
## gene_22758   1   4   0   5  0   3   7   6   7  5  0  5
## gene_22759   0   0   0   0  0   0   0   0   0  0  0  0

Summarize counts and visualize

print(paste("Number of samples:", ncol(counts), sep=" "))
## [1] "Number of samples: 63"
print(paste("Total number of genes in dataframe:", prettyNum(nrow(counts), big.mark = ","), sep=" "))
## [1] "Total number of genes in dataframe: 47,451"
print(paste("Average number of genes per sample:", prettyNum(mean(colSums(counts != 0)), big.mark = ","), sep=" "))
## [1] "Average number of genes per sample: 15,672.27"
print(paste("Total counts, all samples:", prettyNum(sum(colSums(counts)), big.mark = ","), sep=" "))
## [1] "Total counts, all samples: 1,571,734,320"
#print(paste("Counts for", colnames(counts %>% select(contains("Tank"))), ":",  prettyNum(colSums(counts %>% select(contains("Tank"))), big.mark = ","), sep=" "))


#inspect total counts by sample - this generates an interactive plot, hover over bars to identify sample 
 ggplotly(
   ggplot(data.frame(colSums(counts)) %>% 
            dplyr::rename(count.total = 1) %>% rownames_to_column(var="sample")) + 
     geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total count by sample") + 
              theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())) 

Filter dataset

Remove entire sample(s) from dataset if needed (OPTIONAL - comment out of not needed)

Sample 45 has very few counts, remove

remove.list <- c("S45")
counts <- counts[ , -which(names(counts) %in% remove.list)]
sample.info <- sample.info[ -which(sample.info$sampleID %in% remove.list), ]

# resave sample info object
save(sample.info, file="../data/sample.info")

nrow(sample.info) == ncol(counts) #should = TRUE if sample.info and gene count matrix countain the same samples 
## [1] TRUE

Transpose dataframe so each row = a sample, and each column = genes

counts.t <- as.data.frame(t(counts))  

Remove low-frequency genes from dataset (optional, but suggested)

Here we remove genes (rows) that are expressed at very low levels. These are not likely to be biologically relevant and by reducing the number of genes to be assessed for differential expression we reduce the impact of the multiple-comparison correction. The definition of “low-frequency genes” can be adjusted - there’s no hard and fast rule - but this is what I have been using: “Genes with mean count <10 across all samples or those with counts <30 across at minimum 10% of the samples were discarded.” You’ll see that while many genes were discarded (~36,000 were dropped, out of ~47,500), the vast majority of fragments were retained (99.955%)! This is because those 36k genes only had a few reads aligning to them.

keep1 <- colMeans(counts.t, na.rm=TRUE) >= 10 #identify genes with mean count >= 10 across all samples (excluding NAs = 10)
keep2 <- rowSums( counts >= 10 ) >= 0.1*43 #identify genes with counts>=10 across at minimum 10% of the samples
keep <- unique(c(names(which(keep1 == T)), names(which(keep2 == T)))) # list of genes meeting one of the two above criteria
counts.ts <- counts.t[,keep]

# Print summary of gene filtering. 
print(paste("# genes before filtering:", ncol(counts.t)))
## [1] "# genes before filtering: 47451"
print(paste("# genes remaining after pre-filtering:", ncol(counts.ts)))
## [1] "# genes remaining after pre-filtering: 11491"
print(paste("# of genes dropped:", ncol(counts.t) - ncol(counts.ts), sep=" "))
## [1] "# of genes dropped: 35960"
print(paste("% of fragments remaining after pre-filtering: ", signif(100*sum(counts.ts, na.rm = T)/sum(counts.t, na.rm = T), digits = 5), "%", sep=""))
## [1] "% of fragments remaining after pre-filtering: 99.955%"
print(paste("Number of fragments dropped: ", signif(sum(counts.t, na.rm = T)-sum(counts.ts, na.rm = T), digits = 5)))
## [1] "Number of fragments dropped:  712310"
print(paste("% of fragments dropped: ", signif(100*(sum(counts.t, na.rm = T)-sum(counts.ts, na.rm = T))/sum(counts.t, na.rm = T), digits = 5), "%", sep=""))
## [1] "% of fragments dropped: 0.045334%"
print(paste("Number of fragments remaining: ", signif(sum(counts.ts, na.rm = T), digits = 5)))
## [1] "Number of fragments remaining:  1570500000"

Save counts file, and transformed counts file - to be used in analyses!

save(counts, file = "../results/counts")
save(counts.t, file = "../results/counts.t")
save(counts.ts, file = "../results/counts.ts")

Generate annotation objects

Many of the enrichment analyses I run require Uniprot IDs for each gene. Not all annotation files for genomes (or transcriptomes) have Uniprot ID’s associated with each gene. So, to get these ID’s I run a program called Blast to match the sequences of known genes in the reference genome to sequences in the Uniprot/Swissprot database. Blast is run on Sedna or Mox and takes a while (hours to days, depending on the number/size of gene sequences). Before doing that, though, I need to create a .bed file which identifies the location of each gene in the genome. Once I create a bed file listing genes, I will use that with getfasta from bedtools (another bash language program) to extract gene sequences, which I will then use with blast to identify gene function

Generate .bed file of genes for Blasting against Uniprot/Swissprot

NOTE: GFF format use a 1-based coordinate system, while BED format uses a 0-based coordinate system. I therefore need to convert my coordinates. Check out this helpful coordinate system summary.

read.delim(file = "../references/final_annotation.gff", sep = "", header = F) %>%
  mutate(V3=as.factor(V3)) %>% filter(V3=="gene") %>%
  select(V1, V4, V5, V9) %>%
  mutate(V4=as.numeric(V4), V5=as.numeric(V5)) %>%
  mutate(V4=V4-1) %>% #convert from 1-based to 0-based by subtracting 1 from the start position
  mutate(ID=str_extract(V9, "ID=(.*?);")) %>%
  mutate(ID=str_remove(ID, ";")) %>% mutate(ID=str_remove(ID, "ID=")) %>%
  select(V1,V4,V5,ID) %>%
  write.table(., "../references/tanner_genes.bed", sep = "\t",
              col.names = F, row.names = F, quote = F)

Check out resulting .bed file.

This has 4 columns: 1. Chromosome/contig ID (on tanner genome), 2. Gene start locus 3. Gene end locus, 4. Gene ID

head ../references/tanner_genes.bed
## ptg000001l   307959  309359  gene_21170
## ptg000001l   308964  309738  gene_21172
## ptg000002l   40323   41936   gene_20492
## ptg000002l   42740   43499   gene_20493
## ptg000003l   6099    7875    gene_22758
## ptg000003l   8823    9462    gene_22759
## ptg000003l   28603   41091   gene_22760
## ptg000003l   48542   50825   gene_22756
## ptg000003l   61793   67795   gene_22761
## ptg000003l   64361   65873   gene_22757

Run blast on Sedna/Mox to annotate genes.

Used the script blast-tanner.sh, which needs the following inputs:
1. The most current Uniprot/Swissprot database, which can be downloaded from https://www.uniprot.org/help/downloads. Select the “Reviewed (Swiss-Prot)” in fasta format.
2. The genome fasta file that was used for aligning reads. E.g. the tanner fasta was tanner.asm.hic.p_ctg.fa. Before blasting, the script uses bedtools getfasta to grab the sequences for each gene from the genome. It’s critical that the annotation file (.gff) that was used in the previous step to generate the .bed file is associated with the same genome file!
3. The programs blast and bedtools need to be installed and loaded (the above script shows how they can be loaded on Mox)

Once the blast results have finished, proceed to next step!

Import Tanner genome annotation blast results (Uniprot/Swissprot annotated genes)

NOTE: To run the below code you’ll need the file tanner_genes_blastx.tab. It’s very large, so grab it from Google Drive and save to the snowcrab/references/ directory.

blastx output format 6 is tab file with the following columns:
1. qaccver = Query accesion.version
2. saccver = Subject accession.version
3. pident = Percentage of identical matches
4. length = Alignment length
5. mismatch = Number of mismatches
6. gapopen = Number of gap openings
7. qstart = Start of alignment in query
8. qend = End of alignment in query
9. sstart = Start of alignment in subject
10. send = End of alignment in subject
11. evalue = Expect value
12. bitscore = Bit score

tanner.blast <- 
  read_delim(file = "../references/tanner_genes_blastx.tab", delim = "\t", 
                           col_names = c("qaccver", "saccver", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore")) %>%
  separate(qaccver, sep = ":|-", into = c("SeqID","Start","End")) %>%
  mutate_at(vars(Start, End), as.numeric) %>%
  separate(saccver, sep="\\|", into=c("na", "SPID", "gene.Uni"), remove = F) %>%
  dplyr::select(-na) %>% separate(gene.Uni, sep="_", into=c("gene.Uni", "species"), remove=T) %>%
  group_by(SeqID, Start, End) %>% dplyr::slice(which.min(evalue))  %>% # where multiple blast hits for same gene, select one with minimum e-value
  left_join(., read_delim(file="../references/tanner_genes.bed", delim="\t", col_names = c("SeqID", "Start", "End", "GeneID"))) %>%
  dplyr::select(GeneID, everything()) %>%
  clean_names()
## Rows: 2370863 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): qaccver, saccver
## dbl (10): pident, length, mismatch, gapopen, qstart, qend, sstart, send, eva...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 47451 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): SeqID, GeneID
## dbl (2): Start, End
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = c("SeqID", "Start", "End")
# E-VALUE FILTERING
# Count annotated genes with e-value filters
tanner.blast %>% filter(evalue < 1.0e-10) %>% nrow() # 12,993 genes have e-value <1e-10
## [1] 12993
tanner.blast %>% filter(evalue < 1.0e-15) %>% nrow() # 9,979 genes have e-value <1e-15
## [1] 9979
tanner.blast %>% filter(evalue < 1.0e-20) %>% nrow() # 7,252 genes have e-value <1e-20
## [1] 7252
# Filter blast results to meet desired e-value threshold - here we choose 1e-10
tanner.blast <- tanner.blast %>% filter(evalue < 1.0e-10)

save(tanner.blast, file = "../references/tanner.blast")
load(file = "../references/tanner.blast")

Annotate with Gene Ontology (GO) terms and other info

NOTE: I needed GO IDs and gene functions, which weren’t included in the blast file. So I copied the column containing all Uniprot IDs from the blast object, then pasted those into the tool Uniprot batch retrieval tool (https://www.uniprot.org/uploadlists/), and selected the columns: Entry, Entry name, Protein names, Gene names, Organism, Gene ontology (biological process), Gene ontology (cellular componenet), Gene ontology (molecular processes), Gene ontology (GO), Gene ontology Ids. I then downloaded all entries to a tab file, saved as: /references/tanner_genes_GO.tsv. Now I’ll read that into R and join with the tanner.blast dataframe to link GO IDs with genes for exploration and enrichment analyses.

# Use this code to get list of Uniprot SPID (gene identifiers) to input into the Uniprot Batch Retrieval tool, to obtain GO terms for each gene
tanner.blast %>% ungroup() %>% dplyr::select(spid) %>% distinct() %>%
  na.omit() %>% unlist() %>% as.vector() %>% write_clip(allow_non_interactive = TRUE)

# After downloading the tanner_genes_GO.tsv file, add GO terms to Uniprot annotation object
tanner.blast.GO <- left_join(tanner.blast, read_delim(file = "../references/tanner_genes_GO.tsv", 
                                                      delim = "\t") %>% clean_names(),
                             by = c("spid"="entry")) %>% ungroup()
## Rows: 3776 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): From, Entry, Entry Name, Protein names, Gene Names, Organism, Gene...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
save(tanner.blast.GO, file = "../references/tanner.blast.GO")
#load(file = "../references/tanner.blast.GO")

# Preview annotated list of genes with their Gene Ontology information, which describes the various functions performed by each gene
head(tanner.blast.GO)
## # A tibble: 6 × 28
##   gene_id    seq_id    start    end saccver spid  gene_uni species pident length
##   <chr>      <chr>     <dbl>  <dbl> <chr>   <chr> <chr>    <chr>    <dbl>  <dbl>
## 1 gene_22760 ptg0000…  28603  41091 sp|Q9D… Q9D9… F186A    MOUSE     29.5    370
## 2 gene_22762 ptg0000…  67966  70416 sp|Q9U… Q9UL… ANR50    HUMAN     27.5    236
## 3 gene_22083 ptg0000…  16091  16581 sp|Q8I… Q8IZ… F200C    HUMAN     33.3     93
## 4 gene_22106 ptg0000… 156430 157971 sp|O75… O755… JERKY    HUMAN     41.1    331
## 5 gene_22085 ptg0000… 170352 178379 sp|Q6P… Q6P6… DDB1     XENLA     83.3     78
## 6 gene_22090 ptg0000… 254082 260100 sp|Q3U… Q3U1… DDB1     MOUSE     72.4     58
## # … with 18 more variables: mismatch <dbl>, gapopen <dbl>, qstart <dbl>,
## #   qend <dbl>, sstart <dbl>, send <dbl>, evalue <dbl>, bitscore <dbl>,
## #   from <chr>, entry_name <chr>, protein_names <chr>, gene_names <chr>,
## #   organism <chr>, gene_ontology_biological_process <chr>,
## #   gene_ontology_cellular_component <chr>, gene_ontology_go <chr>,
## #   gene_ontology_molecular_function <chr>, gene_ontology_i_ds <chr>

Annotate gene counts dataframe

Add gene name and functional information to each gene in our snow crab dataset!

counts.annot.tanner <- right_join(
  tanner.blast %>% ungroup() %>% dplyr::select(gene_id, spid, gene_uni, species, pident, evalue, gene_id, seq_id, start, end, gene_id),
  counts.ts %>% t() %>% as.data.frame() %>% rownames_to_column("gene_id"), "gene_id") %>%
  left_join(tanner.blast.GO %>% dplyr::select(gene_id, protein_names, gene_names), "gene_id") 

save(counts.annot.tanner, file = "../results/counts.annot.tanner")
#load(file = "../results/counts.annot.tanner")
write.csv(counts.annot.tanner, file = "../results/counts.annot.tanner.csv", row.names = F, quote = F) #save annotated gene information to .csv file

How many of our analyzed genes are annotated? Answer: 5,808 (out of 11,491 genes that we will analyze)

right_join(
  tanner.blast %>% ungroup() %>% dplyr::select(gene_id, spid, gene_uni, species, pident, evalue),
  counts.ts %>% t() %>% as.data.frame() %>% rownames_to_column(var = "gene_id"), "gene_id") %>% filter(!is.na(spid)) %>%
  nrow()
## [1] 5808

Here’s the total number of genes in the tanner gff: 47,449 genes

read_delim(file = "../references/final_annotation.gff", delim = "\t", col_names = F, skip = 8) %>% 
  mutate(X3=as.factor(X3)) %>% filter(X3=="gene") %>% nrow()
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 257189 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): X1, X2, X3, X6, X7, X8, X9
## dbl (2): X4, X5
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] 47449